DATA 698 : Capstone Research Project
Capstone Project on CPI and Employment
DATA 698 : Capstone Research Project
1 Overview
We are in totally different era of twenty first century, and it gives us very rare situation where any positive news would help the humankind. We want to use the historical CPI data and find the relation of it with the employment, in hope that we would have some positive news on employment by following the trend of the data in past.
We feel that lower CPI would result in more job opportunity, as the it gives space for more competition in small business across sectors.
2 Capstone Project on CPI and Employment
- The estimates of employment for 1998-2006 are based on the 2002 North American Industry Classification System (NAICS). The estimates for 2007-2010 are based on the 2007 NAICS. The estimates for 2011-2016 are based on the 2012 NAICS. The estimates for 2017 forward are based on the 2017 NAICS.
- Excludes limited partners.
- Under the 2007 NAICS, internet publishing and broadcasting was reclassified to other information services.
- Not shown to avoid disclosure of confidential information; estimates are included in higher-level totals.
- Estimate for employment suppressed to cover corresponding estimate for earnings. Estimates for this item are included in the total.
- (NA) Not available.
- (NM) Not meaningful.
Last updated: September 24, 2019- new statistics for 2018; revised statistics for 2014-2017.
3 Data Preparation
Load the required libraries
#packages <- c("R.rsp","pandocfilters","knitr","plyr","tidyr","dplyr","ggplot2","plotly","sqldf","MASS","reshape2","Amelia","mice","googleVis","stringi","ROCR","scatterplot3d","visdat","gridExtra","sjmisc")
#install.packages (packages, repos="http://cran.us.r-project.org", dependencies=TRUE, lib="C:/Program Files/R/R-3.6.3/library")
#invisible (lapply (packages, library, character.only = T))
#install.packages("devtools", lib="C:/Program Files/R/R-3.6.3/library")
#devtools::install_github("hadley/ggplot2")
library(tidyverse)
library(kableExtra)
library(lubridate)
library(forecast)
library(stringi)
library(dplyr)3.1 Load Employment Datasets
file_path <- "../data/cpi/SAINC/"
EMP_DATA_FILE <- paste0(file_path,"EMP9818.csv")
EMP_DATA <- read_delim(EMP_DATA_FILE, delim = ",", col_names = TRUE, trim_ws= TRUE)
#,skip = 6,col_types = list(col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character()))
# names(SAP_INV_DAT) = c("DROP","I_BILLDATE","I_ORG","I_DC","I_DOCCA","I_BILLTYPE","I_ORDER_REA","I_DIV","I_MATYPE","I_BILL_QTY","BLANK")
# SAP_INV_DAT<- SAP_INV_DAT[-which(is.na(SAP_INV_DAT$I_ORG)),] %>% .[-str_which(trimws(.$I_ORG),'SOrg.|-----'),c(-1,-11)]
# 3
# SAP_INV_DAT$I_BILLDATE <- date(parse_datetime(SAP_INV_DAT$I_BILLDATE, "%m/%d/%Y"))
#
# head(SAP_INV_DAT)3.2 Show Relevant Employment Data
## GeoFIPS GeoName Region TableName
## Length:7084 Length:7084 Min. :1.000 Length:7084
## Class :character Class :character 1st Qu.:3.000 Class :character
## Mode :character Mode :character Median :5.000 Mode :character
## Mean :4.475
## 3rd Qu.:6.000
## Max. :8.000
## NA's :122
## LineCode IndustryClassification Description Unit
## Min. : 10.0 Length:7084 Length:7084 Length:7084
## 1st Qu.: 517.0 Class :character Class :character Class :character
## Median : 712.5 Mode :character Mode :character Mode :character
## Mean : 872.1
## 3rd Qu.:1103.0
## Max. :2012.0
## NA's :4
## 1998 1999 2000 2001
## Length:7084 Length:7084 Length:7084 Length:7084
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## 2002 2003 2004 2005
## Length:7084 Length:7084 Length:7084 Length:7084
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## 2006 2007 2008 2009
## Length:7084 Length:7084 Length:7084 Length:7084
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## 2010 2011 2012 2013
## Length:7084 Length:7084 Length:7084 Length:7084
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## 2014 2015 2016 2017
## Length:7084 Length:7084 Length:7084 Length:7084
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## 2018
## Length:7084
## Class :character
## Mode :character
##
##
##
##
## Observations: 5,760
## Variables: 24
## $ LineCode <dbl> 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, ...
## $ Description <chr> "Forestry and logging", "Forestry and logging", "Forest...
## $ GeoName <chr> "United States", "Alabama", "Alaska", "Arizona", "Arkan...
## $ `1998` <chr> "168100", "10193", "1397", "598", "7248", "7200", "698"...
## $ `1999` <chr> "168700", "9637", "1455", "(T)", "7266", "7273", "727",...
## $ `2000` <chr> "153300", "8852", "1307", "(T)", "6662", "6208", "693",...
## $ `2001` <chr> "148400", "(D)", "986", "(D)", "6087", "6139", "637", "...
## $ `2002` <chr> "144600", "8347", "858", "(D)", "(D)", "5917", "(D)", "...
## $ `2003` <chr> "133800", "7929", "800", "(D)", "5716", "5340", "(D)", ...
## $ `2004` <chr> "140100", "8222", "(D)", "363", "(D)", "5545", "(D)", "...
## $ `2005` <chr> "147300", "8187", "790", "(D)", "5949", "5776", "(D)", ...
## $ `2006` <chr> "142700", "8124", "623", "(D)", "(D)", "5610", "(D)", "...
## $ `2007` <chr> "138100", "7800", "520", "(D)", "(D)", "5656", "(D)", "...
## $ `2008` <chr> "136900", "7850", "460", "(D)", "5095", "5861", "(D)", ...
## $ `2009` <chr> "119800", "7056", "440", "318", "4408", "4841", "(D)", ...
## $ `2010` <chr> "111600", "6741", "(D)", "(D)", "4160", "4626", "579", ...
## $ `2011` <chr> "116800", "6962", "(D)", "298", "4229", "4991", "558", ...
## $ `2012` <chr> "134600", "7728", "505", "(D)", "(D)", "5603", "726", "...
## $ `2013` <chr> "138000", "7970", "501", "388", "(D)", "5566", "755", "...
## $ `2014` <chr> "133000", "7843", "525", "(D)", "(D)", "5256", "699", "...
## $ `2015` <chr> "141300", "8059", "534", "(D)", "(D)", "5564", "(D)", "...
## $ `2016` <chr> "138800", "7892", "503", "(D)", "4890", "5266", "795", ...
## $ `2017` <chr> "123900", "7107", "(D)", "(D)", "4405", "4975", "691", ...
## $ `2018` <chr> "117600", "6759", "401", "379", "4248", "4796", "641", ...
## Observations: 118
## Variables: 3
## $ LineCode <fct> 10, 20, 40, 50, 60, 70, 80, 90, 100, 101, 102, 103, 200...
## $ Description <fct> "Total employment (number of jobs)", "Wage and salary e...
## $ NA <fct> "A count of jobs, both full-time and part-time. It incl...
3.3 Load CPI Data
file_path <- "../data/cpi/archive/"
## Read files named *.csv
filenames <- list.files(path=file_path,
pattern="*.csv",
full.names = TRUE)
#print(filenames)
##Create list of data frame names without the ".csv" part
names <- gsub(pattern='\\.',
replacement='_', # remove the last 4 characters from filename (_clean)
x=(sub(pattern = "(.*)\\..*$", replacement = "\\1", # remove extenson from filename (.csv)
basename(filenames) # get filename without the directory file path
)
)
)
#print(names)
###Load all files
for(i in filenames){
## Create list of data frame names without the ".csv" part
name <- gsub(pattern='\\.',
replacement='_', # remove the last 4 characters from filename (_clean)
x=(sub(pattern = "(.*)\\..*$", replacement = "\\1", # remove extenson from filename (.csv)
basename(i) # get filename without the directory file path
)
)
)
print(name)
assign(x=name, value=data.frame(read.csv(i)))
#DT::datatable(name)
}## [1] "cu_area"
## [1] "cu_base"
## [1] "cu_data_0_Current"
## [1] "cu_data_1_AllItems"
## [1] "cu_data_10_OtherWest"
## [1] "cu_data_11_USFoodBeverage"
## [1] "cu_data_12_USHousing"
## [1] "cu_data_13_USApparel"
## [1] "cu_data_14_USTransportation"
## [1] "cu_data_15_USMedical"
## [1] "cu_data_16_USRecreation"
## [1] "cu_data_17_USEducationAndCommunication"
## [1] "cu_data_18_USOtherGoodsAndServices"
## [1] "cu_data_19_PopulationSize"
## [1] "cu_data_2_Summaries"
## [1] "cu_data_20_USCommoditiesServicesSpecial"
## [1] "cu_data_3_AsizeNorthEast"
## [1] "cu_data_4_AsizeNorthCentral"
## [1] "cu_data_5_AsizeSouth"
## [1] "cu_data_6_AsizeWest"
## [1] "cu_data_7_OtherNorthEast"
## [1] "cu_data_8_OtherNorthCentral"
## [1] "cu_data_9_OtherSouth"
## [1] "cu_footnote"
## [1] "cu_item"
## [1] "cu_period"
## [1] "cu_periodicity"
## [1] "cu_series"
3.4 Show CPI Reference Data
3.4.1 Area
3.4.2 Base
3.4.3 Item
3.4.4 Periodicity
3.4.5 Series
3.4.6 Period
3.5 Combined Reference Data
3.6 Show CPI Data from Select Industry
3.6.1 CPI - US Food Beverages
FoodItemCode <- cu_reference_data %>%
filter(grepl('SAF|SEF|SS0|SS1|SS20', item_code))
#dplyr::filter(item_code %like% 'SAH' | item_code %like% 'SEH')
DT::datatable((FoodItemCode))cu_data_USFoodBeverage = cu_data_0_Current %>%
inner_join(FoodItemCode, by="series_id") %>%
inner_join(cu_period, by="period") %>%
filter(year %in% (1998:2018)) %>%
select(c("series_id",
"series_title",
"year",
"period",
"period_abbr",
"period_name",
"value",
"area_code",
"area_name",
"area_display_level",
"area_selectable",
"area_sort_sequence",
"item_code",
"item_name",
"item_display_level",
"item_selectable",
"item_sort_sequence",
"seasonal",
"periodicity_code",
"periodicity_name",
"base_code",
"base_name",
"base_period"))
DT::datatable(head(cu_data_USFoodBeverage))3.6.2 CPI - US Housing
HousingItemCode <- cu_reference_data %>%
#filter(grepl('SAH|SEH', item_code, fixed = TRUE))
filter(grepl('SAH|SEH', item_code))
#dplyr::filter(item_code %like% 'SAH' | item_code %like% 'SEH')
DT::datatable((HousingItemCode))cu_data_USHousing = cu_data_0_Current %>%
inner_join(HousingItemCode, by="series_id") %>%
inner_join(cu_period, by="period") %>%
#filter(item_code %like% c('SAH','SEH')) %>%
filter(year %in% (1998:2018)) %>%
select(c("series_id",
"series_title",
"year",
"period",
"period_abbr",
"period_name",
"value",
"area_code",
"area_name",
"area_display_level",
"area_selectable",
"area_sort_sequence",
"item_code",
"item_name",
"item_display_level",
"item_selectable",
"item_sort_sequence",
"seasonal",
"periodicity_code",
"periodicity_name",
"base_code",
"base_name",
"base_period"))
DT::datatable(head(cu_data_USHousing))3.6.3 CPI - US Transportation
TransportItemCode <- cu_reference_data %>%
filter(grepl('SAT|SET|SS4|SS52|SS53|SAS24|SAS4', item_code))
#dplyr::filter(item_code %like% 'SAH' | item_code %like% 'SEH')
DT::datatable((TransportItemCode))cu_data_USTransportation = cu_data_0_Current %>%
inner_join(TransportItemCode, by="series_id") %>%
inner_join(cu_period, by="period") %>%
filter(year %in% (1998:2018)) %>%
select(c("series_id",
"series_title",
"year",
"period",
"period_abbr",
"period_name",
"value",
"area_code",
"area_name",
"area_display_level",
"area_selectable",
"area_sort_sequence",
"item_code",
"item_name",
"item_display_level",
"item_selectable",
"item_sort_sequence",
"seasonal",
"periodicity_code",
"periodicity_name",
"base_code",
"base_name",
"base_period"))
DT::datatable(head(cu_data_USTransportation))3.6.4 CPI - US Medical
MedicalItemCode <- cu_reference_data %>%
filter(grepl('SAM|SEM|SS57', item_code))
#dplyr::filter(item_code %like% 'SAH' | item_code %like% 'SEH')
DT::datatable((MedicalItemCode))cu_data_USMedical = cu_data_0_Current %>%
inner_join(MedicalItemCode, by="series_id") %>%
inner_join(cu_period, by="period") %>%
filter(year %in% (1998:2018)) %>%
select(c("series_id",
"series_title",
"year",
"period",
"period_abbr",
"period_name",
"value",
"area_code",
"area_name",
"area_display_level",
"area_selectable",
"area_sort_sequence",
"item_code",
"item_name",
"item_display_level",
"item_selectable",
"item_sort_sequence",
"seasonal",
"periodicity_code",
"periodicity_name",
"base_code",
"base_name",
"base_period"))
DT::datatable(head(cu_data_USMedical))3.6.5 CPI - US Education And Communication
EducationItemCode <- cu_reference_data %>%
filter(grepl('SAE|SEE|SSE|SS27', item_code))
#dplyr::filter(item_code %like% 'SAH' | item_code %like% 'SEH')
DT::datatable((EducationItemCode))cu_data_USEducationAndCommunication = cu_data_0_Current %>%
inner_join(EducationItemCode, by="series_id") %>%
inner_join(cu_period, by="period") %>%
filter(year %in% (1998:2018)) %>%
select(c("series_id",
"series_title",
"year",
"period",
"period_abbr",
"period_name",
"value",
"area_code",
"area_name",
"area_display_level",
"area_selectable",
"area_sort_sequence",
"item_code",
"item_name",
"item_display_level",
"item_selectable",
"item_sort_sequence",
"seasonal",
"periodicity_code",
"periodicity_name",
"base_code",
"base_name",
"base_period"))
DT::datatable(head(cu_data_USEducationAndCommunication))4 Data Mapping - Employment vs CPI
4.1 CPI Data by Industry
4.1.1 Food & Beverages
cu_emp_data_USFoodBeverage = cu_data_USFoodBeverage %>%
mutate(parent_item_code = 'SAF', emp_line_code = 705) %>%
select(c("parent_item_code",
"emp_line_code",
"series_id",
"series_title",
"year",
"period",
"period_abbr",
"period_name",
"value",
"area_code",
"area_name",
"area_display_level",
"area_selectable",
"area_sort_sequence",
"item_code",
"item_name",
"item_display_level",
"item_selectable",
"item_sort_sequence",
"seasonal",
"periodicity_code",
"periodicity_name",
"base_code",
"base_name",
"base_period"))
DT::datatable(head(cu_emp_data_USFoodBeverage))4.1.2 Housing
cu_emp_data_USHousing = cu_data_USHousing %>%
mutate(parent_item_code = 'SAH', emp_line_code = 1100) %>%
select(c("parent_item_code",
"emp_line_code",
"series_id",
"series_title",
"year",
"period",
"period_abbr",
"period_name",
"value",
"area_code",
"area_name",
"area_display_level",
"area_selectable",
"area_sort_sequence",
"item_code",
"item_name",
"item_display_level",
"item_selectable",
"item_sort_sequence",
"seasonal",
"periodicity_code",
"periodicity_name",
"base_code",
"base_name",
"base_period"))
DT::datatable(head(cu_emp_data_USHousing))4.1.3 Transportation
cu_emp_data_USTransportation = cu_data_USTransportation %>%
mutate(parent_item_code = 'SAT', emp_line_code = 800) %>%
select(c("parent_item_code",
"emp_line_code",
"series_id",
"series_title",
"year",
"period",
"period_abbr",
"period_name",
"value",
"area_code",
"area_name",
"area_display_level",
"area_selectable",
"area_sort_sequence",
"item_code",
"item_name",
"item_display_level",
"item_selectable",
"item_sort_sequence",
"seasonal",
"periodicity_code",
"periodicity_name",
"base_code",
"base_name",
"base_period"))
DT::datatable(head(cu_emp_data_USTransportation))4.1.4 Medical
cu_emp_data_USMedical = cu_data_USMedical %>%
mutate(parent_item_code = 'SAM', emp_line_code = 1600) %>%
select(c("parent_item_code",
"emp_line_code",
"series_id",
"series_title",
"year",
"period",
"period_abbr",
"period_name",
"value",
"area_code",
"area_name",
"area_display_level",
"area_selectable",
"area_sort_sequence",
"item_code",
"item_name",
"item_display_level",
"item_selectable",
"item_sort_sequence",
"seasonal",
"periodicity_code",
"periodicity_name",
"base_code",
"base_name",
"base_period"))
DT::datatable(head(cu_emp_data_USMedical))4.1.5 Education & Communication
cu_data_USEducationAndCommunication = cu_data_USEducationAndCommunication %>%
mutate(parent_item_code = 'SAE', emp_line_code = 1500) %>%
select(c("parent_item_code",
"emp_line_code",
"series_id",
"series_title",
"year",
"period",
"period_abbr",
"period_name",
"value",
"area_code",
"area_name",
"area_display_level",
"area_selectable",
"area_sort_sequence",
"item_code",
"item_name",
"item_display_level",
"item_selectable",
"item_sort_sequence",
"seasonal",
"periodicity_code",
"periodicity_name",
"base_code",
"base_name",
"base_period"))
DT::datatable(head(cu_data_USEducationAndCommunication))